Dot Pattern Variability
  • Prior Work
    • 2022_Paper
    • Hu_Nosofsky24
  • Dot Patterns
  • Dot Similarity
  • Task

On this page

  • Data Inspection & Sanity Checks
    • Prototype set counts
    • Rating Distributions
    • Reaction Time Distributions
  • Lowest and Highest Rated Pairs
  • All pairs with >=10 ratings
    • Correlations with Accuracy in Hu & Nosofsky 2024
  • View source
  • Edit this page
  • Report an issue

Dot Pattern Similarity

  • Show All Code
  • Hide All Code

  • View Source
Published

March 25, 2024

prototype sets refer to sets of 3 prototypes that correspond to a single participant in the Hu & Nosofsky 2024 study. prototype pairs refer to the specific pairs of prototypes that are displayed together on a rating trial (3 pairs per set).

Code
pacman::p_load(dplyr,purrr,tidyr,ggplot2, here, patchwork, 
  conflicted, jsonlite,stringr, gt, knitr, kableExtra, lubridate,ggh4x)
conflict_prefer_all("dplyr", quiet = TRUE)
options(digits=2, scipen=999, dplyr.summarise.inform=FALSE)
walk(c("fun_plot"), ~ source(here::here(paste0("R/", .x, ".R"))))
mc24_proto <- read.csv(here("Stimulii","mc24_prototypes.csv")) |> mutate(set=paste0(sbjCode,"_",condit)) 
sbj_cat <- read.csv(here("data","mc24_sbj_cat.csv"))

dfiles <- list(path=list.files(here::here("data/dotSim_data"),full.names=TRUE))

d <- map_dfr(dfiles$path, ~read.csv(.x))

d <- map_dfr(dfiles$path, ~{read.csv(.x) |> 
    mutate(sfile=tools::file_path_sans_ext(basename(.x)))}) |> 
  select(-trial_index, -internal_node_id,-trial_type) |>
   mutate(set = paste(str_extract(item_label_1, "^\\d+"),
                     str_extract(item_label_1, "[a-z]+"), sep = "_")) |>
  mutate(pair_label = paste0(item_label_1,"_",item_label_2)) |>
  relocate(sbjCode,date,set,pair_label,trial,item_label_1,item_label_2,response,rt)

setCounts <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(set) |> summarise(n=n_distinct(sbjCode),resp=mean(response),sd=sd(response)) |> arrange(desc(n))

# length(unique(mc_proto$set)) # 304
setCounts2 <- mc24_proto |> group_by(set) |> 
  slice_head(n=1) |> 
  select(id,file,set) |> 
  left_join(setCounts,by="set") |> 
  mutate(n = ifelse(is.na(n), 0, n), .groups="drop") |> 
  arrange(n) |> ungroup()

pairCounts <- d |> 
  group_by(pair_label,set) |> 
  summarise(n=n(),mean_resp=mean(response),sd=sd(response)) |> arrange(desc(n)) |> ungroup()

Data Inspection & Sanity Checks

Code
avg_set_rating <- setCounts2 |> summarise("Avg Ratings Per Set" = mean(n)) |> pull(1)

d |> 
  summarize("N Subjects" = n_distinct(sbjCode), "N Prototype Sets" = n_distinct(set)) |> 
  mutate("Avg Ratings Per Set" = avg_set_rating) |>
  kbl()
Table 1: Current counts of unique subjects, and prototype sets
N Subjects N Prototype Sets Avg Ratings Per Set
29 304 9.6
Code
# d |> summarize(n=n(), n_distinct(sbjCode), n_distinct(file), n_distinct(set), n_distinct(trial), n_distinct(item_label_1), n_distinct(item_label_2))






# d %>%
#   filter(sbjCode == 11) %>%
#   select(sbjCode, date, trial, set, rt, time) %>%
#   mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) %>%
#   group_by(sbjCode, date) %>%
#   summarise(start_time = min(time_parsed), end_time = max(time_parsed)) %>%
#   mutate(endTimeMinusStart = end_time - start_time)

plot_hist_sbj <- function(id) {
  d |> filter(sbjCode==id) |>
    ggplot(aes(x = response)) +
    geom_histogram(binwidth=1,fill = 'dodgerblue4') +
    scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) +
    theme_minimal() +
    theme(axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.x=element_text(size=26))  
}


sbj_sum <- d |> group_by(sbjCode) |> 
#filter(sbjCode<5) |>
  mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) |>
  summarize ("Mean Rating"=mean(response),
  "SD Rating"=sd(response), 
  "Mean RT"=mean(rt)/1000, 
  #"Total Time (min)" = max(time_elapsed)/60000,
  "Total Time (min)" = round(difftime(max(time_parsed), min(time_parsed), units = "mins"),1),
  n_prototype_sets = n_distinct(set), 
  "N Trials" = n_distinct(trial)) |> 
  mutate("Response_Distribution"=sbjCode) 

  sbj_sum |> gt() |> 
    text_transform(
    locations = cells_body(columns = 'Response_Distribution'),
    fn = function(column) {
      map(column, plot_hist_sbj) |>
        ggplot_image(height = px(80), aspect_ratio = 3)
    }
    )
sbj_sum |> mutate(total_time=as.numeric(`Total Time (min)`)) |> 
  rename("Subject"=sbjCode, "N_Sets" = n_prototype_sets) |> 
  summarize("Average Completion Time (min)" = mean(total_time), "Min Completion Time (min)" = min(total_time), "Max Completion Time (min)" = max(total_time)) |> kbl()
Table 2: Individual Subject Ratings
sbjCode Mean Rating SD Rating Mean RT Total Time (min) n_prototype_sets N Trials Response_Distribution
2 6.3 2.0 2.22 14.7 101 303
3 4.7 1.6 1.62 11.6 101 303
4 4.5 1.6 3.52 21.1 101 303
5 4.0 2.0 2.02 13.5 101 303
7 5.1 2.8 2.79 17.4 101 303
8 5.0 2.4 2.10 13.9 101 303
9 4.7 2.1 2.81 17.2 101 303
10 4.6 1.7 2.45 15.8 101 303
11 2.8 1.8 5.80 32.4 101 303
12 4.3 2.6 3.76 22.2 101 303
13 5.8 2.4 1.21 9.5 101 303
14 4.5 1.8 3.74 22 101 303
15 5.1 2.3 2.11 14 101 303
16 6.1 2.1 1.62 11.5 101 303
17 5.5 2.7 1.05 8.7 101 303
18 2.5 2.5 2.13 14.2 101 303
19 4.4 2.3 1.91 13 101 303
20 5.7 1.6 2.18 14.4 101 303
21 4.6 2.3 3.93 23.1 101 303
22 4.4 1.6 3.44 20.6 101 303
23 3.3 2.5 2.00 13.5 101 303
24 4.7 2.3 1.51 11.1 101 303
25 5.2 1.6 3.06 18.8 101 303
26 4.7 2.0 1.83 12.6 101 303
27 4.5 2.6 2.69 17 101 303
28 4.7 2.1 6.25 34.9 101 303
29 4.9 2.2 2.75 17.2 101 303
30 5.3 1.9 0.65 6.6 101 303
31 6.6 2.4 1.14 9 101 303
Average Completion Time (min) Min Completion Time (min) Max Completion Time (min)
16 6.6 35

Prototype set counts

Code
# d |> filter(sbjCode==11) |> select(sbjCode,date,trial,pair_label,set,rt,time_elapsed,time)

# d |> group_by(sbjCode,set) |> 
#   summarize (n=n()) |>
#   gt()

#d |> group_by(sbjCode, item_label_1, item_label_2) |> summarise(n=n())

# (1-.33)^8
# (factorial(8)/(factorial(6)*factorial(8-6))) * (.33^6)*((1-.33)^(8-6))
# (factorial(8)/(factorial(7)*factorial(8-7))) *(.33^6)*((1-.33)^(8-7))
# (factorial(8)/(factorial(8)*factorial(8-8))) *(.33^6)*(1-.33)^(8-8)

# d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(sbjCode, item) |> summarise(n=n())

# patternCounts <- d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(item) |> summarise(n=n(),resp=mean(response),sd=sd(response)) |> arrange(desc(n))



# d |> 
#     pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> select(sbjCode,set,pair_label,item_label,item,response) |>  group_by(set) |>
#     summarize(n=n_distinct(sbjCode)) |> arrange(desc(n)) 




# d |> group_by(sbjCode, file) |> summarise(n=n())
# d |> group_by(sbjCode, set) |> summarise(n=n())

# d |> group_by(sbjCode) |> summarise(n_distinct(file))
# d |> group_by(sbjCode) |> summarise(n_distinct(set))


sp <- setCounts2 |> 
  mutate(set=reorder(set,n)) |>
  ggplot(aes(x=set,y=n)) +
   geom_col() +
   theme(legend.title=element_blank(),
      axis.text.x = element_text(size=5,angle = 90, hjust = 0.5, vjust = 0.5)) +
    labs(x="Prototype Set", y="Number of Participants to rate set") 

sh <- setCounts2 |> 
  ggplot(aes(x=n)) + geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks=seq(0, max(setCounts2$n), by = 1)) +
  geom_text(stat="count", aes(label=..count..), vjust=-0.5) +
  labs(x="Number of times prototype set has been included in the study", 
  y="Number of prototype sets for each count") 


sp/sh

Prototype set counts

Prototype set counts

 

Code
setCounts2 |> group_by(n) |> summarise(nc=n()) |> rename("Number of times prototype set has been included in the study"=n, "Number of prototype sets with this count"=nc) |> gt() |> 
  tab_spanner(label = "Prototype Set Counts") |> 
  tab_header(title = "Prototype Set Counts") |> 
  tab_source_note(
    "Note: The number of times a prototype set has been included in the study is the number of participants who rated the set."
  )
Prototype Set Counts
Number of times prototype set has been included in the study Number of prototype sets with this count
3 3
4 7
5 8
6 17
7 30
8 37
9 44
10 48
11 44
12 20
13 18
14 17
15 5
16 5
17 1
Note: The number of times a prototype set has been included in the study is the number of participants who rated the set.

 

Rating Distributions

Code
pgr <- d |> 
  ggplot(aes(x=response))+geom_histogram(binwidth=1) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Aggregate Rating Distribution", x="Rating", y="Count") 

pir <- d |>  ggplot(aes(x=response))+
      geom_histogram(binwidth=1) + 
      facet_wrap(~sbjCode) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Rating Distribution per Sbj.", x="Rating", y="Count") 

pgr/pir

Rating distributions

Rating distributions

Reaction Time Distributions

Code
prtg <- d |> ggplot(aes(x=rt))+
  geom_density() + 
  labs(title="Aggregate Reaction Time Distribution", x="Reaction Time (ms)", y="Density")

prtid <- d |> ggplot(aes(x=rt))+geom_density() + 
  facet_wrap(~sbjCode,scale="free_x") + labs(title="Reaction Time Distribution per Sbj.", x="Reaction Time (ms)", y="Density")

prtg/prtid

Reaction time distributions

Reaction time distributions

Lowest and Highest Rated Pairs

Code
# patternCounts |> filter(n>=8) |>  slice_min(resp)
# patternCounts |> filter(n>=8) |>  slice_max(resp)

# setCounts |> filter(n>=24) |>  slice_min(resp)
# setCounts |> filter(n>=24) |>  slice_max(resp)


# pairCounts |> filter(n>=5) |>  slice_min(resp,n=2)
# pairCounts |> filter(n>=5) |>  slice_max(resp)

min_resp=7
n_show=3

d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_min(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() + 
  plot_annotation(title=glue::glue("Lowest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))

Code
d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_max(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() +  
  plot_annotation(title=glue::glue("Highest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))

All pairs with >=10 ratings

  • click on column headers to change sort order
    • e.g. clicking on “Mean Rating” will toggle showing the pairs rated most similar or most dissimilar
    • clicking on “SD” will toggle showing the pairs with the most or least agreement in ratings
Code
pat_table_plot <- function(Pair){

  df <- d |> filter(pair_label==Pair) |> slice_head(n=1) 

    pat1 <- df %>%
          mutate(pattern_1 = purrr::map(pattern_1, jsonlite::fromJSON)) %>%
          unnest(pattern_1) %>%
          mutate(y=-y, pat=item_label_1) |> select(pair_label,x,y,pat)

    pat2 <- df %>%
          mutate(pattern_2 = purrr::map(pattern_2, jsonlite::fromJSON)) %>%
          unnest(pattern_2) %>%
          mutate(y=-y, pat=item_label_2) |> select(pair_label,x,y,pat)

    pat <- rbind(pat1,pat2)

     pat |> 
    ggplot(aes(x = x, y = y)) +
          geom_point(alpha=2) +
          coord_cartesian(xlim = c(-25, 25), ylim = c(-25, 25)) +
          theme_minimal() +
          facet_wrap(~pat,ncol=2) + 
          #theme_blank +
          theme_void() + 
          theme(strip.text = element_text(size = 7,hjust=.5),
                panel.spacing.x=unit(-7.3, "lines")) 
}



p5 <- pairCounts |> filter(n>=10) 


p5 |> 
relocate(pair_label,.after=sd) |>
rename("Pair"=pair_label, "N"=n, "Mean Rating"=mean_resp, "SD"=sd) |>
#group_by(Pair) |> 
gt() |> 
tab_options(table.font.size = px(8L)) |>
  cols_width(
    set ~ px(116),
    Pair ~ px(415),
    `N` ~ px(50),
    `Mean Rating` ~ px(90),
    SD ~ px(55)
  ) |>  
  fmt_number(decimals = 1) |> #fmt_integer() |>
  cols_align('left', columns = set) |> 
  text_transform(
    locations = cells_body(columns = Pair),
    fn = function(column) {
      map(column, pat_table_plot) |>
        ggplot_image(height = px(230), aspect_ratio = 1.8)
    }
  ) |>  
  opt_interactive(page_size_default=5, 
    use_page_size_select= TRUE, use_search=TRUE, use_resizers=TRUE,use_filters=TRUE, page_size_values = c(5, 10, 25, 50, 100)) 

Correlations with Accuracy in Hu & Nosofsky 2024

  • not really enough data for this yet

Assess # of patterns in various binnings - e.g. quartile, decile

Code
patternAvg <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(item,file) |> 
  summarise(n_rating=n(),resp=mean(response),sd=sd(response)) |> 
  arrange(desc(n_rating))


cat_sim <- sbj_cat |> 
  mutate(item=item_label) |>
  left_join(patternAvg,by=c("file","item"))  |> arrange(desc(n_rating)) |>
  #remove rows where n_rating is NA, or less than 4
  filter(!is.na(n_rating),n_rating>=12) |> 
  mutate(sim_group = ifelse(resp>6.0,"Very Similar",ifelse(resp<3.5,"Very Dissimilar","Medium"))) |> 
  mutate(sim_group=factor(sim_group,levels=c("Very Dissimilar","Medium","Very Similar"))) 


# bin data by rating (resp) into quartiles
t1 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 5))|>
  group_by(Quartile) |>
  summarize("Avg. Similarity Rating"=mean(resp),sd=sd(resp),n_ratings=n_distinct(file), .groups="drop") 

t2 <- cat_sim |> 
  mutate(Decile = ntile(resp, 10))|>
  group_by(Decile) |>
  summarize("Avg. Similarity Rating"=mean(resp),sd=sd(resp),n_ratings=n_distinct(file), .groups="drop") 


t3 <- cat_sim |> 
  group_by(sim_group) |>
  summarize("Avg. Similarity Rating"=mean(resp),sd=sd(resp),n_ratings=n_distinct(file), .groups="drop") 

t1 |> kbl(caption="Quartiles")
Quartiles
Quartile Avg. Similarity Rating sd n_ratings
1 3.4 0.53 106
2 4.3 0.17 125
3 4.8 0.13 140
4 5.3 0.14 133
5 6.0 0.39 103
Code
t2 |> kbl(caption="Deciles")
Deciles
Decile Avg. Similarity Rating sd n_ratings
1 3.0 0.46 56
2 3.8 0.13 69
3 4.2 0.10 70
4 4.5 0.08 71
5 4.7 0.07 78
6 5.0 0.07 78
7 5.2 0.07 76
8 5.4 0.07 75
9 5.7 0.11 69
10 6.3 0.34 58
Code
t3 |> kbl(caption="Extreme Groups")
Extreme Groups
sim_group Avg. Similarity Rating sd n_ratings
Very Dissimilar 2.9 0.45 50
Medium 4.8 0.64 270
Very Similar 6.4 0.34 52

Combined Category Testing Performance with Similarity Ratings (resp)

  • Each subject in the 2024 study has a similarity score for each of their 3 categories. (averaged over 2 comparisons with that categories prototype)
  • The same category similarity scores are then compared to their accuracy for each of the Pattern Tyeps (old, prototype, new low, new med, new high)
Code
cat_sim %>% # round all numerics except sbjCode to 2 decimal places
 mutate(across(where(is.numeric), ~round(., 1))) |> select(-id,-sim_group,-item_label) |> 
  relocate(item,file, .after=sd) |>
  rename("Category Similarity" = resp, "CatLearn Accuracy" = Corr) |>
   DT::datatable(options = list(pageLength = 6))
Code
p3 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 4)) |> 
  ggplot(aes(x=Quartile,y=Corr,fill=Quartile)) + 
  stat_bar + 
  facet_wrap(~Pattern.Type) + labs(y="CatLearn Accuracy", x="Similarity Rating Quintile", title="Effect by Pattern Type")


p4 <- cat_sim |> 
  mutate(Decile = ntile(resp, 10)) |> 
  ggplot(aes(x=Decile,y=Corr,fill=Decile)) + 
  stat_bar + 
  facet_wrap(~Pattern.Type) + labs(y="CatLearn Accuracy", x="Similarity Rating Decile", title="Effect by Pattern Type")





p5 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 4)) |> 
  ggplot(aes(x=Quartile,y=Corr,fill=Quartile)) + 
  stat_bar + 
  facet_wrap(~condit) + labs(y="CatLearn Accuracy", x="Similarity Rating Quartile", title="Effect by Training Condition")


p6 <- cat_sim |> 
  mutate(Decile = ntile(resp, 10)) |> 
  ggplot(aes(x=Decile,y=Corr,fill=Decile)) + 
  stat_bar + 
  facet_wrap(~condit) + labs(y="CatLearn Accuracy", x="Similarity Rating Decile", title="Effect by Training Condition")


p7 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 4)) |> 
  ggplot(aes(x=Quartile,y=Corr,fill=Quartile)) + 
  stat_bar + 
  facet_nested_wrap(~condit+Pattern.Type) + labs(y="CatLearn Accuracy", x="Similarity Rating Quintile", title="Effect by Training Condition and Pattern Type")


p3 + p4

Code
p5 + p6

Code
p7

Code
p9 <- cat_sim |> 
  ggplot(aes(y=Corr,x=Pattern.Type, fill=sim_group)) + 
  stat_bar + labs(title="Group by Pattern Type",y="CatLearn Accuracy")

p10 <- cat_sim |> 
  ggplot(aes(y=Corr,x=condit, fill=sim_group)) + 
  stat_bar + labs(title="Group by Condit",y="CatLearn Accuracy")

 p9 / p10

Code
cat_sim |> ggplot(aes(x=Corr,y=resp)) + 
  geom_point(aes(col=Pattern.Type)) + 
  geom_smooth(aes(fill=Pattern.Type),method = "lm") + 
 labs(y="Similarity Rating", x="Aggregated Accuracy in CatLearn study")

Code
cat_sim |> ggplot(aes(x=Corr,y=resp)) + 
  geom_point(aes(col=Pattern.Type)) + 
  geom_smooth(aes(fill=Pattern.Type),method = "lm") + 
  facet_wrap(~condit) + labs(y="Similarity Rating", x="Aggregated Accuracy in CatLearn study")

Code
cat_sim |> ggplot(aes(x=Corr,y=resp)) + 
  geom_point(aes(col=condit)) + 
  #geom_smooth(aes(fill=Pattern.Type)) + 
  facet_wrap(~Pattern.Type)

Code
# #| fig-cap: dot plots
# #| fig-width: 10
# #| fig-height: 12

# d %>% filter(trial==1) %>%
#   plot_dots()

# d %>% filter(trial==1) %>%
#   plot_dots2()

# d %>% filter(trial<2) %>%
#   plot_dotsAll()
Code
#| fig-width: 6
#| fig-height: 9


# d %>% filter(file %in% unique(d$file[1])) %>%
#   plot_dotsAll()

# plot_dotsAll_orig <- function(df) {
#   plots <- list()

#   for (i in 1:nrow(df)) {
#     p1 <- df[i, ] %>%
#       pivot_longer(cols = starts_with("x"), names_to = "dot", values_to = "x") %>%
#       mutate(dot = as.numeric(str_remove(dot, "x"))) %>%
#       pivot_longer(cols = starts_with("y"), names_to = "dot2", values_to = "y") %>%
#       mutate(dot2 = as.numeric(str_remove(dot2, "y"))) %>%
#       filter(dot == dot2) %>%
#       ggplot(aes(x = x, y = y)) +
#       geom_point() +
#       coord_cartesian(xlim = c(-25, 25), ylim = c(-25, 25)) +
#       theme_minimal() +
#       labs(title = df$id[i]) + theme_blank

#     plots <- append(plots, list(p1))
#   }

#   patchwork::wrap_plots(plots, ncol = 1)
# }

# mc24_proto |> filter(file %in% unique(d$file[1])) %>% plot_dotsAll_orig()
Source Code
---
title: Dot Pattern Similarity
date: last-modified
lightbox: true
toc: true
page-layout: full
toc-depth: 3
code-fold: true
code-tools: true
execute: 
  warning: false
  eval: true
---



prototype sets refer to sets of 3 prototypes that correspond to a single participant in the Hu & Nosofsky 2024 study. 
prototype pairs refer to the specific pairs of prototypes that are displayed together on a rating trial (3 pairs per set). 


```{r}
pacman::p_load(dplyr,purrr,tidyr,ggplot2, here, patchwork, 
  conflicted, jsonlite,stringr, gt, knitr, kableExtra, lubridate,ggh4x)
conflict_prefer_all("dplyr", quiet = TRUE)
options(digits=2, scipen=999, dplyr.summarise.inform=FALSE)
walk(c("fun_plot"), ~ source(here::here(paste0("R/", .x, ".R"))))
mc24_proto <- read.csv(here("Stimulii","mc24_prototypes.csv")) |> mutate(set=paste0(sbjCode,"_",condit)) 
sbj_cat <- read.csv(here("data","mc24_sbj_cat.csv"))

dfiles <- list(path=list.files(here::here("data/dotSim_data"),full.names=TRUE))

d <- map_dfr(dfiles$path, ~read.csv(.x))

d <- map_dfr(dfiles$path, ~{read.csv(.x) |> 
    mutate(sfile=tools::file_path_sans_ext(basename(.x)))}) |> 
  select(-trial_index, -internal_node_id,-trial_type) |>
   mutate(set = paste(str_extract(item_label_1, "^\\d+"),
                     str_extract(item_label_1, "[a-z]+"), sep = "_")) |>
  mutate(pair_label = paste0(item_label_1,"_",item_label_2)) |>
  relocate(sbjCode,date,set,pair_label,trial,item_label_1,item_label_2,response,rt)

setCounts <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(set) |> summarise(n=n_distinct(sbjCode),resp=mean(response),sd=sd(response)) |> arrange(desc(n))

# length(unique(mc_proto$set)) # 304
setCounts2 <- mc24_proto |> group_by(set) |> 
  slice_head(n=1) |> 
  select(id,file,set) |> 
  left_join(setCounts,by="set") |> 
  mutate(n = ifelse(is.na(n), 0, n), .groups="drop") |> 
  arrange(n) |> ungroup()

pairCounts <- d |> 
  group_by(pair_label,set) |> 
  summarise(n=n(),mean_resp=mean(response),sd=sd(response)) |> arrange(desc(n)) |> ungroup()



```




## Data Inspection & Sanity Checks


```{r}
#| label: tbl-totals
#| tbl-cap: "Current counts of unique subjects, and prototype sets"

avg_set_rating <- setCounts2 |> summarise("Avg Ratings Per Set" = mean(n)) |> pull(1)

d |> 
  summarize("N Subjects" = n_distinct(sbjCode), "N Prototype Sets" = n_distinct(set)) |> 
  mutate("Avg Ratings Per Set" = avg_set_rating) |>
  kbl()



```

```{r}
#| label: tbl-indv-sbj
#| tbl-cap: "Individual Subject Ratings "


# d |> summarize(n=n(), n_distinct(sbjCode), n_distinct(file), n_distinct(set), n_distinct(trial), n_distinct(item_label_1), n_distinct(item_label_2))






# d %>%
#   filter(sbjCode == 11) %>%
#   select(sbjCode, date, trial, set, rt, time) %>%
#   mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) %>%
#   group_by(sbjCode, date) %>%
#   summarise(start_time = min(time_parsed), end_time = max(time_parsed)) %>%
#   mutate(endTimeMinusStart = end_time - start_time)

plot_hist_sbj <- function(id) {
  d |> filter(sbjCode==id) |>
    ggplot(aes(x = response)) +
    geom_histogram(binwidth=1,fill = 'dodgerblue4') +
    scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) +
    theme_minimal() +
    theme(axis.title.x=element_blank(),
          axis.title.y=element_blank(),
          axis.text.x=element_text(size=26))  
}


sbj_sum <- d |> group_by(sbjCode) |> 
#filter(sbjCode<5) |>
  mutate(time_parsed = parse_date_time(paste(date, time), orders = c("mdY IMS p", "mdy IMS p"))) |>
  summarize ("Mean Rating"=mean(response),
  "SD Rating"=sd(response), 
  "Mean RT"=mean(rt)/1000, 
  #"Total Time (min)" = max(time_elapsed)/60000,
  "Total Time (min)" = round(difftime(max(time_parsed), min(time_parsed), units = "mins"),1),
  n_prototype_sets = n_distinct(set), 
  "N Trials" = n_distinct(trial)) |> 
  mutate("Response_Distribution"=sbjCode) 

  sbj_sum |> gt() |> 
    text_transform(
    locations = cells_body(columns = 'Response_Distribution'),
    fn = function(column) {
      map(column, plot_hist_sbj) |>
        ggplot_image(height = px(80), aspect_ratio = 3)
    }
    )
  

sbj_sum |> mutate(total_time=as.numeric(`Total Time (min)`)) |> 
  rename("Subject"=sbjCode, "N_Sets" = n_prototype_sets) |> 
  summarize("Average Completion Time (min)" = mean(total_time), "Min Completion Time (min)" = min(total_time), "Max Completion Time (min)" = max(total_time)) |> kbl()




```




### Prototype set counts


```{r}
#| fig-cap: Prototype set counts
#| fig-width: 12
#| fig-height: 10



# d |> filter(sbjCode==11) |> select(sbjCode,date,trial,pair_label,set,rt,time_elapsed,time)

# d |> group_by(sbjCode,set) |> 
#   summarize (n=n()) |>
#   gt()

#d |> group_by(sbjCode, item_label_1, item_label_2) |> summarise(n=n())

# (1-.33)^8
# (factorial(8)/(factorial(6)*factorial(8-6))) * (.33^6)*((1-.33)^(8-6))
# (factorial(8)/(factorial(7)*factorial(8-7))) *(.33^6)*((1-.33)^(8-7))
# (factorial(8)/(factorial(8)*factorial(8-8))) *(.33^6)*(1-.33)^(8-8)

# d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(sbjCode, item) |> summarise(n=n())

# patternCounts <- d |> pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
#   group_by(item) |> summarise(n=n(),resp=mean(response),sd=sd(response)) |> arrange(desc(n))



# d |> 
#     pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> select(sbjCode,set,pair_label,item_label,item,response) |>  group_by(set) |>
#     summarize(n=n_distinct(sbjCode)) |> arrange(desc(n)) 




# d |> group_by(sbjCode, file) |> summarise(n=n())
# d |> group_by(sbjCode, set) |> summarise(n=n())

# d |> group_by(sbjCode) |> summarise(n_distinct(file))
# d |> group_by(sbjCode) |> summarise(n_distinct(set))


sp <- setCounts2 |> 
  mutate(set=reorder(set,n)) |>
  ggplot(aes(x=set,y=n)) +
   geom_col() +
   theme(legend.title=element_blank(),
      axis.text.x = element_text(size=5,angle = 90, hjust = 0.5, vjust = 0.5)) +
    labs(x="Prototype Set", y="Number of Participants to rate set") 

sh <- setCounts2 |> 
  ggplot(aes(x=n)) + geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks=seq(0, max(setCounts2$n), by = 1)) +
  geom_text(stat="count", aes(label=..count..), vjust=-0.5) +
  labs(x="Number of times prototype set has been included in the study", 
  y="Number of prototype sets for each count") 


sp/sh


```

\ \

```{r}
setCounts2 |> group_by(n) |> summarise(nc=n()) |> rename("Number of times prototype set has been included in the study"=n, "Number of prototype sets with this count"=nc) |> gt() |> 
  tab_spanner(label = "Prototype Set Counts") |> 
  tab_header(title = "Prototype Set Counts") |> 
  tab_source_note(
    "Note: The number of times a prototype set has been included in the study is the number of participants who rated the set."
  )
```

\ \

### Rating Distributions


```{r}
#| fig-cap: Rating distributions
#| fig-width: 11
#| fig-height: 9

pgr <- d |> 
  ggplot(aes(x=response))+geom_histogram(binwidth=1) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Aggregate Rating Distribution", x="Rating", y="Count") 

pir <- d |>  ggplot(aes(x=response))+
      geom_histogram(binwidth=1) + 
      facet_wrap(~sbjCode) + 
      scale_x_continuous(breaks=seq(1, 9, by = 1)) +
    coord_cartesian(xlim = c(1, 9)) + labs(title="Rating Distribution per Sbj.", x="Rating", y="Count") 

pgr/pir

```


### Reaction Time Distributions

```{r}
#| fig-cap: Reaction time distributions
#| fig-width: 11
#| fig-height: 9


prtg <- d |> ggplot(aes(x=rt))+
  geom_density() + 
  labs(title="Aggregate Reaction Time Distribution", x="Reaction Time (ms)", y="Density")

prtid <- d |> ggplot(aes(x=rt))+geom_density() + 
  facet_wrap(~sbjCode,scale="free_x") + labs(title="Reaction Time Distribution per Sbj.", x="Reaction Time (ms)", y="Density")

prtg/prtid

```







## Lowest and Highest Rated Pairs

```{r}
#| fig-width: 9
#| fig-height: 8

# patternCounts |> filter(n>=8) |>  slice_min(resp)
# patternCounts |> filter(n>=8) |>  slice_max(resp)

# setCounts |> filter(n>=24) |>  slice_min(resp)
# setCounts |> filter(n>=24) |>  slice_max(resp)


# pairCounts |> filter(n>=5) |>  slice_min(resp,n=2)
# pairCounts |> filter(n>=5) |>  slice_max(resp)

min_resp=7
n_show=3

d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_min(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() + 
  plot_annotation(title=glue::glue("Lowest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))


d %>% filter(pair_label %in% {pairCounts |> filter(n>=min_resp) |>  
  slice_max(mean_resp,n=n_show, with_ties=FALSE) |> pull(pair_label)} ) |>
  group_by(pair_label) |>
  slice_head(n=1) %>%
  plot_dotsAll() +  
  plot_annotation(title=glue::glue("Highest rated pairs ( out of sets with n>={min_resp} ratings)"), theme = theme(plot.title = element_text(hjust = 0.4)))


```



## All pairs with >=10 ratings

- click on column headers to change sort order
  - e.g. clicking on "Mean Rating" will toggle showing the pairs rated most similar or most dissimilar
  - clicking on "SD" will toggle showing the pairs with the most or least agreement in ratings

::: column-page-right

```{r}
pat_table_plot <- function(Pair){

  df <- d |> filter(pair_label==Pair) |> slice_head(n=1) 

    pat1 <- df %>%
          mutate(pattern_1 = purrr::map(pattern_1, jsonlite::fromJSON)) %>%
          unnest(pattern_1) %>%
          mutate(y=-y, pat=item_label_1) |> select(pair_label,x,y,pat)

    pat2 <- df %>%
          mutate(pattern_2 = purrr::map(pattern_2, jsonlite::fromJSON)) %>%
          unnest(pattern_2) %>%
          mutate(y=-y, pat=item_label_2) |> select(pair_label,x,y,pat)


    pat <- rbind(pat1,pat2)

     pat |> 
    ggplot(aes(x = x, y = y)) +
          geom_point(alpha=2) +
          coord_cartesian(xlim = c(-25, 25), ylim = c(-25, 25)) +
          theme_minimal() +
          facet_wrap(~pat,ncol=2) + 
          #theme_blank +
          theme_void() + 
          theme(strip.text = element_text(size = 7,hjust=.5),
                panel.spacing.x=unit(-7.3, "lines")) 
}



p5 <- pairCounts |> filter(n>=10) 


p5 |> 
relocate(pair_label,.after=sd) |>
rename("Pair"=pair_label, "N"=n, "Mean Rating"=mean_resp, "SD"=sd) |>
#group_by(Pair) |> 
gt() |> 
tab_options(table.font.size = px(8L)) |>
  cols_width(
    set ~ px(116),
    Pair ~ px(415),
    `N` ~ px(50),
    `Mean Rating` ~ px(90),
    SD ~ px(55)
  ) |>  
  fmt_number(decimals = 1) |> #fmt_integer() |>
  cols_align('left', columns = set) |> 
  text_transform(
    locations = cells_body(columns = Pair),
    fn = function(column) {
      map(column, pat_table_plot) |>
        ggplot_image(height = px(230), aspect_ratio = 1.8)
    }
  ) |>  
  opt_interactive(page_size_default=5, 
    use_page_size_select= TRUE, use_search=TRUE, use_resizers=TRUE,use_filters=TRUE, page_size_values = c(5, 10, 25, 50, 100)) 
    


```

:::



### Correlations with Accuracy in Hu & Nosofsky 2024

- not really enough data for this yet


#### Assess # of patterns in various binnings - e.g. quartile, decile
```{r}
patternAvg <- d |> 
  pivot_longer(cols=c(item_label_1, item_label_2), names_to="item_label", values_to="item") |> 
  group_by(item,file) |> 
  summarise(n_rating=n(),resp=mean(response),sd=sd(response)) |> 
  arrange(desc(n_rating))


cat_sim <- sbj_cat |> 
  mutate(item=item_label) |>
  left_join(patternAvg,by=c("file","item"))  |> arrange(desc(n_rating)) |>
  #remove rows where n_rating is NA, or less than 4
  filter(!is.na(n_rating),n_rating>=12) |> 
  mutate(sim_group = ifelse(resp>6.0,"Very Similar",ifelse(resp<3.5,"Very Dissimilar","Medium"))) |> 
  mutate(sim_group=factor(sim_group,levels=c("Very Dissimilar","Medium","Very Similar"))) 


# bin data by rating (resp) into quartiles
t1 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 5))|>
  group_by(Quartile) |>
  summarize("Avg. Similarity Rating"=mean(resp),sd=sd(resp),n_ratings=n_distinct(file), .groups="drop") 

t2 <- cat_sim |> 
  mutate(Decile = ntile(resp, 10))|>
  group_by(Decile) |>
  summarize("Avg. Similarity Rating"=mean(resp),sd=sd(resp),n_ratings=n_distinct(file), .groups="drop") 


t3 <- cat_sim |> 
  group_by(sim_group) |>
  summarize("Avg. Similarity Rating"=mean(resp),sd=sd(resp),n_ratings=n_distinct(file), .groups="drop") 

t1 |> kbl(caption="Quartiles")
t2 |> kbl(caption="Deciles")
t3 |> kbl(caption="Extreme Groups")
```

#### Combined Category Testing Performance with Similarity Ratings (resp)

- Each subject in the 2024 study has a similarity score for each of their 3 categories. (averaged over 2 comparisons with that categories prototype)
- The same category similarity scores are then compared to their accuracy for each of the Pattern Tyeps (old, prototype, new low, new med, new high)
```{r}
cat_sim %>% # round all numerics except sbjCode to 2 decimal places
 mutate(across(where(is.numeric), ~round(., 1))) |> select(-id,-sim_group,-item_label) |> 
  relocate(item,file, .after=sd) |>
  rename("Category Similarity" = resp, "CatLearn Accuracy" = Corr) |>
   DT::datatable(options = list(pageLength = 6))


```



```{r}
#| fig-width: 10
#| fig-height: 8



p3 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 4)) |> 
  ggplot(aes(x=Quartile,y=Corr,fill=Quartile)) + 
  stat_bar + 
  facet_wrap(~Pattern.Type) + labs(y="CatLearn Accuracy", x="Similarity Rating Quintile", title="Effect by Pattern Type")


p4 <- cat_sim |> 
  mutate(Decile = ntile(resp, 10)) |> 
  ggplot(aes(x=Decile,y=Corr,fill=Decile)) + 
  stat_bar + 
  facet_wrap(~Pattern.Type) + labs(y="CatLearn Accuracy", x="Similarity Rating Decile", title="Effect by Pattern Type")





p5 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 4)) |> 
  ggplot(aes(x=Quartile,y=Corr,fill=Quartile)) + 
  stat_bar + 
  facet_wrap(~condit) + labs(y="CatLearn Accuracy", x="Similarity Rating Quartile", title="Effect by Training Condition")


p6 <- cat_sim |> 
  mutate(Decile = ntile(resp, 10)) |> 
  ggplot(aes(x=Decile,y=Corr,fill=Decile)) + 
  stat_bar + 
  facet_wrap(~condit) + labs(y="CatLearn Accuracy", x="Similarity Rating Decile", title="Effect by Training Condition")


p7 <- cat_sim |> 
  mutate(Quartile = ntile(resp, 4)) |> 
  ggplot(aes(x=Quartile,y=Corr,fill=Quartile)) + 
  stat_bar + 
  facet_nested_wrap(~condit+Pattern.Type) + labs(y="CatLearn Accuracy", x="Similarity Rating Quintile", title="Effect by Training Condition and Pattern Type")


p3 + p4
p5 + p6
p7






p9 <- cat_sim |> 
  ggplot(aes(y=Corr,x=Pattern.Type, fill=sim_group)) + 
  stat_bar + labs(title="Group by Pattern Type",y="CatLearn Accuracy")

p10 <- cat_sim |> 
  ggplot(aes(y=Corr,x=condit, fill=sim_group)) + 
  stat_bar + labs(title="Group by Condit",y="CatLearn Accuracy")

 p9 / p10

```



```{r}
#| fig-width: 10
#| fig-height: 7


cat_sim |> ggplot(aes(x=Corr,y=resp)) + 
  geom_point(aes(col=Pattern.Type)) + 
  geom_smooth(aes(fill=Pattern.Type),method = "lm") + 
 labs(y="Similarity Rating", x="Aggregated Accuracy in CatLearn study")

cat_sim |> ggplot(aes(x=Corr,y=resp)) + 
  geom_point(aes(col=Pattern.Type)) + 
  geom_smooth(aes(fill=Pattern.Type),method = "lm") + 
  facet_wrap(~condit) + labs(y="Similarity Rating", x="Aggregated Accuracy in CatLearn study")




cat_sim |> ggplot(aes(x=Corr,y=resp)) + 
  geom_point(aes(col=condit)) + 
  #geom_smooth(aes(fill=Pattern.Type)) + 
  facet_wrap(~Pattern.Type)

```








<!-- ### Plot Pairs -->
```{r}
# #| fig-cap: dot plots
# #| fig-width: 10
# #| fig-height: 12

# d %>% filter(trial==1) %>%
#   plot_dots()

# d %>% filter(trial==1) %>%
#   plot_dots2()

# d %>% filter(trial<2) %>%
#   plot_dotsAll()

```



<!-- ### Compare to original prototypes -->
```{r}
#| fig-width: 6
#| fig-height: 9


# d %>% filter(file %in% unique(d$file[1])) %>%
#   plot_dotsAll()

# plot_dotsAll_orig <- function(df) {
#   plots <- list()

#   for (i in 1:nrow(df)) {
#     p1 <- df[i, ] %>%
#       pivot_longer(cols = starts_with("x"), names_to = "dot", values_to = "x") %>%
#       mutate(dot = as.numeric(str_remove(dot, "x"))) %>%
#       pivot_longer(cols = starts_with("y"), names_to = "dot2", values_to = "y") %>%
#       mutate(dot2 = as.numeric(str_remove(dot2, "y"))) %>%
#       filter(dot == dot2) %>%
#       ggplot(aes(x = x, y = y)) +
#       geom_point() +
#       coord_cartesian(xlim = c(-25, 25), ylim = c(-25, 25)) +
#       theme_minimal() +
#       labs(title = df$id[i]) + theme_blank

#     plots <- append(plots, list(p1))
#   }

#   patchwork::wrap_plots(plots, ncol = 1)
# }

# mc24_proto |> filter(file %in% unique(d$file[1])) %>% plot_dotsAll_orig()


```




© 2024 Thomas Gorman

site created with R and quarto

  • View source
  • Edit this page
  • Report an issue